{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In this example, the stacker was handled with cross-validation instead of a train-test-split.\n",
    "The whole training set was used in both of the two stacking phases."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.datasets import fetch_california_housing\n",
    "\n",
    "cali_housing = fetch_california_housing()\n",
    "\n",
    "X = cali_housing.data\n",
    "y = cali_housing.target\n",
    "\n",
    "bins = np.arange(6)\n",
    " \n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "binned_y = np.digitize(y, bins)\n",
    "\n",
    "from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor\n",
    " \n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "X_train_prin, X_test_prin, y_train_prin, y_test_prin = train_test_split(X, y,test_size=0.2,stratify=binned_y,random_state=7)\n",
    "\n",
    "binned_y_train_prin = np.digitize(y_train_prin, bins)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\nfor train_index, test_index in skf.split(X_train_prin, binned_y_train_prin):\\n    print(\"TRAIN:\", train_index, \"TEST:\", test_index)\\n    print (pd.Series(binned_y_train_prin[test_index]).value_counts())\\n    \\n'"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.model_selection import StratifiedKFold\n",
    "\n",
    "skf = StratifiedKFold(n_splits = 3, random_state = 7)\n",
    "skf.split(X_train_prin, binned_y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/julian/anaconda3/envs/bunnies/lib/python3.5/site-packages/sklearn/model_selection/_split.py:597: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=7, shuffle=False),\n",
       "          error_score='raise',\n",
       "          estimator=BaggingRegressor(base_estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "          weights='uniform'),\n",
       "         bootstrap=True, bootstrap_features=False, max_features=1.0,\n",
       "         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,\n",
       "         random_state=None, verbose=0, warm_start=False),\n",
       "          fit_params=None, iid=True, n_iter=5, n_jobs=-1,\n",
       "          param_distributions={'max_features': [0.5, 1.0], 'oob_score': [True, False], 'base_estimator__n_neighbors': [3, 5], 'max_samples': [0.5, 1.0], 'n_estimators': [100]},\n",
       "          pre_dispatch='2*n_jobs', random_state=7, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import BaggingRegressor\n",
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_dist = {\n",
    " 'max_samples': [0.5,1.0],\n",
    " 'max_features' : [0.5,1.0],\n",
    " 'oob_score' : [True, False],\n",
    " 'base_estimator__n_neighbors': [3,5],\n",
    " 'n_estimators': [100]\n",
    " }\n",
    "\n",
    "single_estimator = KNeighborsRegressor()\n",
    "ensemble_estimator = BaggingRegressor(base_estimator = single_estimator)\n",
    "\n",
    "pre_gs_inst_bag = RandomizedSearchCV(ensemble_estimator,\n",
    "                                     param_distributions = param_dist,\n",
    "                                     cv = skf,\n",
    "                                     n_iter = 5,\n",
    "                                     n_jobs=-1,\n",
    "                                    random_state=7)\n",
    "\n",
    "pre_gs_inst_bag.fit(X_train_prin, y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'base_estimator__n_neighbors': 5,\n",
       " 'max_features': 0.5,\n",
       " 'max_samples': 1.0,\n",
       " 'n_estimators': 100,\n",
       " 'oob_score': True}"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pre_gs_inst_bag.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "rs_bag = BaggingRegressor(**{'max_features': 0.5,\n",
    " 'max_samples': 1.0,\n",
    " 'n_estimators': 3000,\n",
    " 'oob_score': True, \n",
    " 'base_estimator': KNeighborsRegressor(n_neighbors=5)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/julian/anaconda3/envs/bunnies/lib/python3.5/site-packages/sklearn/model_selection/_split.py:597: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import cross_val_predict\n",
    "\n",
    "bag_predicted = cross_val_predict(rs_bag, X_train_prin, y_train_prin, cv=skf, n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/julian/anaconda3/envs/bunnies/lib/python3.5/site-packages/sklearn/model_selection/_split.py:597: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=7, shuffle=False),\n",
       "          error_score='raise',\n",
       "          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n",
       "             max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "             min_impurity_split=None, min_samples_leaf=1,\n",
       "             min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "             n_estimators=100, presort='auto', random_state=None,\n",
       "             subsample=1.0, verbose=0, warm_start=True),\n",
       "          fit_params=None, iid=True, n_iter=30, n_jobs=-1,\n",
       "          param_distributions={'max_features': ['log2', 0.4, 0.5, 0.6, 1.0], 'n_estimators': [50, 100], 'learning_rate': [0.01, 0.05, 0.1, 0.25, 0.275, 0.3, 0.325], 'max_depth': [2, 3, 4, 5, 6, 7, 10], 'min_samples_leaf': [1, 2, 3, 4, 5, 10], 'loss': ['ls', 'huber']},\n",
       "          pre_dispatch='2*n_jobs', random_state=7, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_dist = {'max_features' : ['log2',0.4,0.5,0.6,1.0],\n",
    " 'max_depth' : [2,3, 4, 5,6, 7, 10],\n",
    " 'min_samples_leaf' : [1,2, 3, 4, 5, 10],\n",
    " 'n_estimators': [50, 100],\n",
    " 'learning_rate' : [0.01,0.05,0.1,0.25,0.275,0.3,0.325],\n",
    " 'loss' : ['ls','huber']\n",
    " }\n",
    "pre_gs_inst_gb = RandomizedSearchCV(GradientBoostingRegressor(warm_start=True),\n",
    "                                   param_distributions = param_dist,\n",
    "                                   cv=skf, \n",
    "                                   n_iter = 30, \n",
    "                                   n_jobs=-1,random_state=7)\n",
    "pre_gs_inst_gb.fit(X_train_prin, y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.25, loss='huber', max_depth=6,\n",
       "             max_features=1.0, max_leaf_nodes=None,\n",
       "             min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "             min_samples_leaf=10, min_samples_split=2,\n",
       "             min_weight_fraction_leaf=0.0, n_estimators=100,\n",
       "             presort='auto', random_state=None, subsample=1.0, verbose=0,\n",
       "             warm_start=True)"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pre_gs_inst_gb.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "gbt_inst = GradientBoostingRegressor(**{'learning_rate': 0.25,\n",
    " 'loss': 'huber',\n",
    " 'max_depth': 6,\n",
    " 'max_features': 1.0,\n",
    " 'min_samples_leaf': 10,\n",
    " 'n_estimators': 3000,\n",
    " 'warm_start': True})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/julian/anaconda3/envs/bunnies/lib/python3.5/site-packages/sklearn/model_selection/_split.py:597: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    }
   ],
   "source": [
    "gbt_predicted = cross_val_predict(gbt_inst, X_train_prin, y_train_prin, cv=skf, n_jobs=-1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>bag</th>\n",
       "      <th>gbt</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>bag</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.878562</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gbt</th>\n",
       "      <td>0.878562</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          bag       gbt\n",
       "bag  1.000000  0.878562\n",
       "gbt  0.878562  1.000000"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds_df = pd.DataFrame(X_train_prin.copy(),columns = cali_housing .feature_names )#pd.DataFrame(columns = ['bag', 'gbt'])\n",
    "\n",
    "preds_df['bag'] = bag_predicted\n",
    "preds_df['gbt'] = gbt_predicted\n",
    "\n",
    "\n",
    "preds_df[['bag','gbt']].corr()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(16512, 10)"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preds_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/julian/anaconda3/envs/bunnies/lib/python3.5/site-packages/sklearn/model_selection/_split.py:597: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=3.\n",
      "  % (min_groups, self.n_splits)), Warning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "RandomizedSearchCV(cv=StratifiedKFold(n_splits=3, random_state=7, shuffle=False),\n",
       "          error_score='raise',\n",
       "          estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,\n",
       "          max_features='auto', max_leaf_nodes=None,\n",
       "          min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "          min_samples_leaf=1, min_samples_split=2,\n",
       "          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n",
       "          oob_score=False, random_state=None, verbose=0, warm_start=True),\n",
       "          fit_params=None, iid=True, n_iter=15, n_jobs=1,\n",
       "          param_distributions={'max_features': ['sqrt', 'log2', 1.0], 'oob_score': [True, False], 'min_samples_leaf': [1, 2, 3, 7, 11], 'n_estimators': [50, 100]},\n",
       "          pre_dispatch='2*n_jobs', random_state=7, refit=True,\n",
       "          return_train_score=True, scoring=None, verbose=0)"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.ensemble import ExtraTreesRegressor\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_dist = {'max_features' : ['sqrt','log2',1.0],\n",
    " 'min_samples_leaf' : [1, 2, 3, 7, 11],\n",
    " 'n_estimators': [50, 100],\n",
    " 'oob_score': [True, False]}\n",
    "\n",
    "pre_gs_inst_etr = RandomizedSearchCV(ExtraTreesRegressor(warm_start=True,bootstrap=True),\n",
    "                                 param_distributions = param_dist,\n",
    "                                 cv=skf,\n",
    "                                 n_iter = 15,\n",
    "                                 random_state = 7)\n",
    "\n",
    "pre_gs_inst_etr.fit(preds_df.values, y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'max_features': 1.0,\n",
       " 'min_samples_leaf': 11,\n",
       " 'n_estimators': 100,\n",
       " 'oob_score': False}"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pre_gs_inst_etr.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,\n",
       "          max_features=1.0, max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "          min_impurity_split=None, min_samples_leaf=11,\n",
       "          min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "          n_estimators=2000, n_jobs=1, oob_score=False, random_state=None,\n",
       "          verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_etr = ExtraTreesRegressor(**{'max_features': 1.0,\n",
    " 'min_samples_leaf': 11,\n",
    " 'n_estimators': 2000,\n",
    " 'oob_score': False})\n",
    "final_etr.fit(preds_df.values, y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "BaggingRegressor(base_estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "          metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "          weights='uniform'),\n",
       "         bootstrap=True, bootstrap_features=False, max_features=0.5,\n",
       "         max_samples=1.0, n_estimators=3000, n_jobs=1, oob_score=True,\n",
       "         random_state=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rs_bag.fit(X_train_prin, y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n",
       "             learning_rate=0.25, loss='huber', max_depth=6,\n",
       "             max_features=1.0, max_leaf_nodes=None,\n",
       "             min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "             min_samples_leaf=10, min_samples_split=2,\n",
       "             min_weight_fraction_leaf=0.0, n_estimators=3000,\n",
       "             presort='auto', random_state=None, subsample=1.0, verbose=0,\n",
       "             warm_start=True)"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gbt_inst.fit(X_train_prin, y_train_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "def handle_X_set(X_set):\n",
    "    X_copy = X_set.copy()\n",
    "    \n",
    "    y_pred_bag = rs_bag.predict(X_copy)\n",
    "    y_pred_gbt = gbt_inst.predict(X_copy)\n",
    "    preds_df = pd.DataFrame(X_copy, columns = cali_housing .feature_names)\n",
    "\n",
    "    preds_df['bag'] = y_pred_bag\n",
    "    preds_df['gbt'] = y_pred_gbt\n",
    " \n",
    "    return preds_df.values\n",
    "\n",
    "def predict_from_X_set(X_set):\n",
    "    return final_etr.predict(handle_X_set(X_set)) \n",
    "\n",
    "y_pred = predict_from_X_set(X_test_prin)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def mase(y_test, y_pred):\n",
    "    y_avg = y_test.mean()\n",
    "    denominator = np.abs(y_test - y_avg).mean()\n",
    "    numerator = y_test - y_pred\n",
    "    \n",
    "    return np.abs(numerator/denominator).mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "R-squared 0.845963221997\n",
      "MAE   :   0.296563386223\n",
      "MAPE  :   0.165997578898\n",
      "SMAPE :   0.15271275169\n",
      "MASE  :   0.325969762163\n"
     ]
    }
   ],
   "source": [
    "# https://www.otexts.org/fpp/2/5 : contains SMAPE (attributed to Armstrong) and MASE (Hyndman and Koehler)\n",
    "from sklearn.metrics import r2_score, mean_absolute_error\n",
    "\n",
    "print (\"R-squared\",r2_score(y_test_prin, y_pred))\n",
    "print (\"MAE   :  \",mean_absolute_error(y_test_prin, y_pred))\n",
    "print (\"MAPE  :  \",(np.abs(y_test_prin- y_pred)/y_test_prin).mean())\n",
    "print (\"SMAPE :  \",(np.abs(y_test_prin- y_pred)/((y_test_prin + y_pred)/2)).mean())\n",
    "print (\"MASE  :  \",mase(y_test_prin, y_pred)) "
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python cine",
   "language": "python",
   "name": "bunnies"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
